# global options
#knitr::opts_chunk$set(echo=FALSE)
options(DT.options = list(scrollX = TRUE, paging=TRUE, fixedHeader=TRUE, searchHighlight = TRUE))
#load packages
#default
library(DataExplorer);library(data.table);library(dlookr)
library(extrafont);library(formattable);library(GGally);library(here)
library(janitor);library(lubridate);library(naniar)
library(patchwork);library(PerformanceAnalytics)
library(plotly);library(RColorBrewer);library(readxl)
library(skimr);library(tidyverse);library(scales)
#rmarkdown
library(kableExtra)
#ml
library(caret);library(tidymodels);library(h2o);library(glmnet)
In one piped statement, read the raw training data, convert character columns to factors, clean the column names, and order the columns (alphabetically, factors before numerics):
#Read the raw data, convert character columns to factors, standardise the
#column names, and order the columns (alphabetically sorted, factors before
#numerics) in one piped statement.
a = read_csv('train.csv') %>%
mutate(across(where(is.character),as.factor)) %>%
clean_names() %>% select(sort(tidyselect::peek_vars())) %>%
select(where(is.factor), where(is.numeric))
#Seed the RNG so the train/test split is reproducible (88 matches the seed
#already used for the ranger predictions later in this analysis).
set.seed(88)
#Stratify on the outcome so train and test have similar price distributions.
split = a %>% initial_split(strata = sale_price)
train = rsample::training(split)
test = rsample::testing(split)
#interactive preview of the first rows of the factor columns
train %>% select(where(is.factor)) %>% head %>% DT::datatable()
#structure of the factor columns (names, types, sample values)
train %>% select(where(is.factor)) %>% glimpse
## Rows: 1,097
## Columns: 43
## $ alley <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ bldg_type <fct> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 2fmCon, 1F...
## $ bsmt_cond <fct> TA, TA, Gd, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, ...
## $ bsmt_exposure <fct> No, Mn, No, No, Av, Mn, No, No, No, No, Av, No, No, ...
## $ bsmt_fin_type1 <fct> GLQ, GLQ, ALQ, GLQ, GLQ, ALQ, Unf, GLQ, Rec, GLQ, Un...
## $ bsmt_fin_type2 <fct> Unf, Unf, Unf, Unf, Unf, BLQ, Unf, Unf, Unf, Unf, Un...
## $ bsmt_qual <fct> Gd, Gd, TA, Gd, Ex, Gd, TA, TA, TA, Ex, Gd, TA, TA, ...
## $ central_air <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y...
## $ condition1 <fct> Norm, Norm, Norm, Norm, Norm, PosN, Artery, Artery, ...
## $ condition2 <fct> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Artery, No...
## $ electrical <fct> SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, FuseF, SBr...
## $ exter_cond <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, ...
## $ exter_qual <fct> Gd, Gd, TA, TA, Gd, TA, TA, TA, TA, Ex, Gd, TA, TA, ...
## $ exterior1st <fct> VinylSd, VinylSd, Wd Sdng, VinylSd, VinylSd, HdBoard...
## $ exterior2nd <fct> VinylSd, VinylSd, Wd Shng, VinylSd, VinylSd, HdBoard...
## $ fence <fct> NA, NA, NA, MnPrv, NA, NA, NA, NA, NA, NA, NA, GdWo,...
## $ fireplace_qu <fct> NA, TA, Gd, NA, Gd, TA, TA, TA, NA, Gd, Gd, Fa, NA, ...
## $ foundation <fct> PConc, PConc, BrkTil, Wood, PConc, CBlock, BrkTil, B...
## $ functional <fct> Typ, Typ, Typ, Typ, Typ, Typ, Min1, Typ, Typ, Typ, T...
## $ garage_cond <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, ...
## $ garage_finish <fct> RFn, RFn, Unf, Unf, RFn, RFn, Unf, RFn, Unf, Fin, RF...
## $ garage_qual <fct> TA, TA, TA, TA, TA, TA, Fa, Gd, TA, TA, TA, TA, TA, ...
## $ garage_type <fct> Attchd, Attchd, Detchd, Attchd, Attchd, Attchd, Detc...
## $ heating <fct> GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA...
## $ heating_qc <fct> Ex, Ex, Gd, Ex, Ex, Ex, Gd, Ex, Ex, Ex, Ex, TA, Ex, ...
## $ house_style <fct> 2Story, 2Story, 2Story, 1.5Fin, 1Story, 2Story, 1.5F...
## $ kitchen_qual <fct> Gd, Gd, Gd, TA, Gd, TA, TA, TA, TA, Ex, Gd, TA, TA, ...
## $ land_contour <fct> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lv...
## $ land_slope <fct> Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gt...
## $ lot_config <fct> Inside, Inside, Corner, Inside, Inside, Corner, Insi...
## $ lot_shape <fct> Reg, IR1, IR1, IR1, Reg, IR1, Reg, Reg, Reg, IR1, IR...
## $ mas_vnr_type <fct> BrkFace, BrkFace, None, None, Stone, Stone, None, No...
## $ misc_feature <fct> NA, NA, NA, Shed, NA, Shed, NA, NA, NA, NA, NA, NA, ...
## $ ms_zoning <fct> RL, RL, RL, RL, RL, RL, RM, RL, RL, RL, RL, RL, RM, ...
## $ neighborhood <fct> CollgCr, CollgCr, Crawfor, Mitchel, Somerst, NWAmes,...
## $ paved_drive <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y...
## $ pool_qc <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ roof_matl <fct> CompShg, CompShg, CompShg, CompShg, CompShg, CompShg...
## $ roof_style <fct> Gable, Gable, Gable, Gable, Gable, Gable, Gable, Gab...
## $ sale_condition <fct> Normal, Normal, Abnorml, Normal, Normal, Normal, Abn...
## $ sale_type <fct> WD, WD, WD, WD, WD, WD, WD, WD, WD, New, New, WD, WD...
## $ street <fct> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave...
## $ utilities <fct> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, AllP...
#missingness summary for the factor columns, as a scrollable table
(miss_var_summary(train %>% select(where(is.factor))) %>% DT::datatable(options = list(scrollY = '500px')))
#number of distinct levels per factor, sorted descending
sapply(train %>% select(where(is.factor)), n_unique) %>% as.data.frame() %>% arrange(desc(.)) %>% DT::datatable()
#list the observed levels of every factor column
sapply(train %>% select(where(is.factor)), unique)
## $alley
## [1] <NA> Pave Grvl
## Levels: Grvl Pave
##
## $bldg_type
## [1] 1Fam 2fmCon Duplex TwnhsE Twnhs
## Levels: 1Fam 2fmCon Duplex Twnhs TwnhsE
##
## $bsmt_cond
## [1] TA Gd <NA> Fa Po
## Levels: Fa Gd Po TA
##
## $bsmt_exposure
## [1] No Mn Av <NA> Gd
## Levels: Av Gd Mn No
##
## $bsmt_fin_type1
## [1] GLQ ALQ Unf Rec BLQ <NA> LwQ
## Levels: ALQ BLQ GLQ LwQ Rec Unf
##
## $bsmt_fin_type2
## [1] Unf BLQ <NA> ALQ Rec LwQ GLQ
## Levels: ALQ BLQ GLQ LwQ Rec Unf
##
## $bsmt_qual
## [1] Gd TA Ex <NA> Fa
## Levels: Ex Fa Gd TA
##
## $central_air
## [1] Y N
## Levels: N Y
##
## $condition1
## [1] Norm PosN Artery Feedr RRNn RRAe RRAn RRNe PosA
## Levels: Artery Feedr Norm PosA PosN RRAe RRAn RRNe RRNn
##
## $condition2
## [1] Norm Artery RRNn Feedr PosA PosN RRAn RRAe
## Levels: Artery Feedr Norm PosA PosN RRAe RRAn RRNn
##
## $electrical
## [1] SBrkr FuseF FuseA FuseP Mix
## Levels: FuseA FuseF FuseP Mix SBrkr
##
## $exter_cond
## [1] TA Gd Fa Ex
## Levels: Ex Fa Gd Po TA
##
## $exter_qual
## [1] Gd TA Ex Fa
## Levels: Ex Fa Gd TA
##
## $exterior1st
## [1] VinylSd Wd Sdng HdBoard BrkFace MetalSd WdShing CemntBd Plywood AsbShng
## [10] Stucco BrkComm AsphShn Stone ImStucc CBlock
## 15 Levels: AsbShng AsphShn BrkComm BrkFace CBlock CemntBd HdBoard ... WdShing
##
## $exterior2nd
## [1] VinylSd Wd Shng HdBoard MetalSd Wd Sdng Plywood CmentBd BrkFace Stucco
## [10] AsbShng ImStucc AsphShn Brk Cmn Stone Other CBlock
## 16 Levels: AsbShng AsphShn Brk Cmn BrkFace CBlock CmentBd HdBoard ... Wd Shng
##
## $fence
## [1] <NA> MnPrv GdWo GdPrv MnWw
## Levels: GdPrv GdWo MnPrv MnWw
##
## $fireplace_qu
## [1] <NA> TA Gd Fa Po Ex
## Levels: Ex Fa Gd Po TA
##
## $foundation
## [1] PConc BrkTil Wood CBlock Slab Stone
## Levels: BrkTil CBlock PConc Slab Stone Wood
##
## $functional
## [1] Typ Min1 Maj1 Min2 Mod Maj2
## Levels: Maj1 Maj2 Min1 Min2 Mod Sev Typ
##
## $garage_cond
## [1] TA Fa <NA> Gd Po Ex
## Levels: Ex Fa Gd Po TA
##
## $garage_finish
## [1] RFn Unf Fin <NA>
## Levels: Fin RFn Unf
##
## $garage_qual
## [1] TA Fa Gd <NA> Ex Po
## Levels: Ex Fa Gd Po TA
##
## $garage_type
## [1] Attchd Detchd BuiltIn CarPort <NA> Basment 2Types
## Levels: 2Types Attchd Basment BuiltIn CarPort Detchd
##
## $heating
## [1] GasA GasW Grav Wall OthW Floor
## Levels: Floor GasA GasW Grav OthW Wall
##
## $heating_qc
## [1] Ex Gd TA Fa
## Levels: Ex Fa Gd Po TA
##
## $house_style
## [1] 2Story 1.5Fin 1Story 1.5Unf SFoyer 2.5Unf SLvl 2.5Fin
## Levels: 1.5Fin 1.5Unf 1Story 2.5Fin 2.5Unf 2Story SFoyer SLvl
##
## $kitchen_qual
## [1] Gd TA Ex Fa
## Levels: Ex Fa Gd TA
##
## $land_contour
## [1] Lvl Bnk Low HLS
## Levels: Bnk HLS Low Lvl
##
## $land_slope
## [1] Gtl Mod Sev
## Levels: Gtl Mod Sev
##
## $lot_config
## [1] Inside Corner CulDSac FR2 FR3
## Levels: Corner CulDSac FR2 FR3 Inside
##
## $lot_shape
## [1] Reg IR1 IR2 IR3
## Levels: IR1 IR2 IR3 Reg
##
## $mas_vnr_type
## [1] BrkFace None Stone BrkCmn <NA>
## Levels: BrkCmn BrkFace None Stone
##
## $misc_feature
## [1] <NA> Shed Gar2 Othr
## Levels: Gar2 Othr Shed TenC
##
## $ms_zoning
## [1] RL RM C (all) FV RH
## Levels: C (all) FV RH RL RM
##
## $neighborhood
## [1] CollgCr Crawfor Mitchel Somerst NWAmes OldTown BrkSide Sawyer NridgHt
## [10] NAmes MeadowV IDOTRR Edwards Timber SawyerW Veenker ClearCr Gilbert
## [19] NoRidge NPkVill StoneBr Blmngtn BrDale SWISU Blueste
## 25 Levels: Blmngtn Blueste BrDale BrkSide ClearCr CollgCr Crawfor ... Veenker
##
## $paved_drive
## [1] Y N P
## Levels: N P Y
##
## $pool_qc
## [1] <NA> Ex Fa Gd
## Levels: Ex Fa Gd
##
## $roof_matl
## [1] CompShg WdShngl Metal WdShake Membran Tar&Grv Roll
## Levels: ClyTile CompShg Membran Metal Roll Tar&Grv WdShake WdShngl
##
## $roof_style
## [1] Gable Hip Gambrel Mansard Flat Shed
## Levels: Flat Gable Gambrel Hip Mansard Shed
##
## $sale_condition
## [1] Normal Abnorml Partial AdjLand Family Alloca
## Levels: Abnorml AdjLand Alloca Family Normal Partial
##
## $sale_type
## [1] WD New COD ConLD ConLI Con ConLw CWD Oth
## Levels: COD Con ConLD ConLI ConLw CWD New Oth WD
##
## $street
## [1] Pave Grvl
## Levels: Grvl Pave
##
## $utilities
## [1] AllPub NoSeWa
## Levels: AllPub NoSeWa
#bar charts of level frequencies for every factor column
DataExplorer::plot_bar(train %>% select(where(is.factor)))
#interactive preview of the first rows of the numeric columns
train %>% select(where(is.numeric)) %>% head %>% DT::datatable()
#structure of the numeric columns
train %>% select(where(is.numeric)) %>% glimpse
## Rows: 1,097
## Columns: 37
## $ bedroom_abv_gr <dbl> 3, 3, 3, 1, 3, 3, 2, 2, 3, 4, 3, 2, 2, 2, 3, 3, 3, ...
## $ bsmt_fin_sf1 <dbl> 706, 486, 216, 732, 1369, 859, 0, 851, 906, 998, 0,...
## $ bsmt_fin_sf2 <dbl> 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ bsmt_full_bath <dbl> 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, ...
## $ bsmt_half_bath <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ bsmt_unf_sf <dbl> 150, 434, 540, 64, 317, 216, 952, 140, 134, 177, 14...
## $ enclosed_porch <dbl> 0, 0, 272, 0, 0, 228, 205, 0, 0, 0, 0, 176, 0, 0, 0...
## $ fireplaces <dbl> 0, 1, 1, 0, 1, 2, 2, 2, 0, 2, 1, 1, 0, 0, 0, 1, 1, ...
## $ full_bath <dbl> 2, 2, 1, 1, 2, 2, 2, 1, 1, 3, 2, 1, 1, 2, 1, 2, 1, ...
## $ garage_area <dbl> 548, 608, 642, 480, 636, 484, 468, 205, 384, 736, 8...
## $ garage_cars <dbl> 2, 2, 3, 2, 2, 2, 2, 1, 1, 3, 3, 1, 2, 2, 1, 2, 2, ...
## $ garage_yr_blt <dbl> 2003, 2001, 1998, 1993, 2004, 1973, 1931, 1939, 196...
## $ gr_liv_area <dbl> 1710, 1786, 1717, 1362, 1694, 2090, 1774, 1077, 104...
## $ half_bath <dbl> 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ kitchen_abv_gr <dbl> 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, ...
## $ lot_area <dbl> 8450, 11250, 9550, 14115, 10084, 10382, 6120, 7420,...
## $ lot_frontage <dbl> 65, 68, 60, 85, 75, NA, 51, 50, 70, 85, 91, NA, 51,...
## $ low_qual_fin_sf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ mas_vnr_area <dbl> 196, 162, 0, 0, 186, 240, 0, 0, 0, 286, 306, 212, 0...
## $ misc_val <dbl> 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 500, 0, ...
## $ mo_sold <dbl> 2, 9, 2, 10, 8, 11, 4, 1, 2, 7, 8, 5, 7, 10, 5, 9, ...
## $ ms_sub_class <dbl> 60, 60, 70, 50, 20, 60, 50, 190, 20, 60, 20, 20, 45...
## $ open_porch_sf <dbl> 61, 42, 35, 30, 57, 204, 0, 4, 0, 21, 33, 213, 112,...
## $ overall_cond <dbl> 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 5, 5, 8, 5, 6, 5, 7, ...
## $ overall_qual <dbl> 7, 7, 7, 5, 8, 7, 7, 5, 5, 9, 7, 6, 7, 4, 5, 8, 5, ...
## $ pool_area <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ sale_price <dbl> 208500, 223500, 140000, 143000, 307000, 200000, 129...
## $ screen_porch <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tot_rms_abv_grd <dbl> 8, 6, 7, 5, 7, 7, 8, 5, 5, 11, 7, 5, 5, 6, 6, 7, 6,...
## $ total_bsmt_sf <dbl> 856, 920, 756, 796, 1686, 1107, 952, 991, 1040, 117...
## $ wood_deck_sf <dbl> 0, 0, 0, 40, 255, 235, 90, 0, 0, 147, 160, 0, 48, 0...
## $ x1st_flr_sf <dbl> 856, 920, 961, 796, 1694, 1107, 1022, 1077, 1040, 1...
## $ x2nd_flr_sf <dbl> 854, 866, 756, 566, 0, 983, 752, 0, 0, 1142, 0, 0, ...
## $ x3ssn_porch <dbl> 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ year_built <dbl> 2003, 2001, 1915, 1993, 2004, 1973, 1931, 1939, 196...
## $ year_remod_add <dbl> 2003, 2002, 1970, 1995, 2005, 1973, 1950, 1950, 196...
## $ yr_sold <dbl> 2008, 2008, 2006, 2009, 2007, 2009, 2008, 2008, 200...
#missingness summary for the numeric columns
(miss_var_summary(train %>% select(where(is.numeric))) %>% DT::datatable(options = list(scrollY = '500px')))
#boxplots of each numeric variable against the outcome
DataExplorer::plot_boxplot(train %>% select(where(is.numeric)), by = 'sale_price')
## Warning: Removed 55 rows containing non-finite values (stat_boxplot).
## Warning: Removed 197 rows containing non-finite values (stat_boxplot).
#distribution of each numeric variable
DataExplorer::plot_histogram(train %>% select(where(is.numeric)))
DataExplorer::plot_density(train %>% select(where(is.numeric)))
#pairwise correlation heatmap of the numeric variables
GGally::ggcorr(train %>% select(where(is.numeric)), low = '#990000', mid = '#E0E0E0', high = '#009900', label = TRUE)
#outcome distribution: frequency polygon and histogram
train %>% ggplot(aes(sale_price)) + geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
train %>% ggplot(aes(sale_price)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#five-number summary of the outcome (mean > median, i.e. right-skewed)
summary(train$sale_price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 35311 130000 163000 181841 213500 755000
#Q-Q plots: raw sale_price vs normal, then log10(sale_price) vs normal --
#this motivates modelling log10(sale_price) in the sections below
qqnorm(train$sale_price)
qqline(train$sale_price, lwd = 2, col = 'darkred')
qqnorm(log10(train$sale_price))
qqline(log10(train$sale_price), lwd = 2, col = 'darkred')
Reference: [recipes package reference](https://recipes.tidymodels.org/reference/index.html)
#Define (and print) the preprocessing recipe; steps run in order at prep/bake time.
(recipe = train %>% recipe(sale_price ~ . ) %>%
#data conversion: treat calendar-like numerics as categorical
step_mutate(mo_sold = factor(mo_sold)) %>%
step_mutate(year_built = factor(year_built)) %>%
step_mutate(year_remod_add = factor(year_remod_add)) %>%
step_mutate(yr_sold = factor(yr_sold)) %>%
step_mutate(garage_yr_blt = factor(garage_yr_blt)) %>% # * has missing data in test set
#data imputation: numeric
#NOTE(review): step_medianimpute() is superseded by step_impute_median() in
#newer recipes releases -- rename when upgrading the package
step_medianimpute(lot_frontage, mas_vnr_area) %>%
#assign an explicit 'unknown' level to NA values in nominal predictors
step_unknown(all_nominal(),-all_outcomes()) %>%
#handling potential multicollinearity via filtering
step_corr(all_numeric(),-all_outcomes()) %>%
#drop near-zero-variance and zero-variance numeric predictors
step_nzv(all_numeric(),-all_outcomes()) %>%
step_zv(all_numeric(),-all_outcomes()) %>%
#center and scale numeric data
step_normalize(all_numeric(),-all_outcomes()) %>%
##!!<NOTE> for these 2 vars, test ds has fewer levels than the train ds, so omit!
step_naomit(year_built, garage_yr_blt)
#dummy variable creation should occur AFTER normalization of numeric vars
#step_dummy(all_nominal(),-all_outcomes(), one_hot = TRUE)
##!!<NOTE> this is annoying, but if I don't use it I keep getting missing vals in test set
#step_pca(all_predictors(),-all_outcomes())
)
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 79
##
## Operations:
##
## Variable mutation for mo_sold
## Variable mutation for year_built
## Variable mutation for year_remod_add
## Variable mutation for yr_sold
## Variable mutation for garage_yr_blt
## Median Imputation for lot_frontage, mas_vnr_area
## Unknown factor level assignment for all_nominal(), -all_outcomes()
## Correlation filter on all_numeric(), -all_outcomes()
## Sparse, unbalanced variable filter on all_numeric(), -all_outcomes()
## Zero variance filter on all_numeric(), -all_outcomes()
## Centering and scaling for all_numeric(), -all_outcomes()
## Removing rows with NA values in year_built, garage_yr_blt
#print the trained (prepped) recipe to confirm which steps removed variables
(recipe %>% prep())
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 79
##
## Training data contained 1097 data points and 1097 incomplete rows.
##
## Operations:
##
## Variable mutation for mo_sold [trained]
## Variable mutation for year_built [trained]
## Variable mutation for year_remod_add [trained]
## Variable mutation for yr_sold [trained]
## Variable mutation for garage_yr_blt [trained]
## Median Imputation for lot_frontage, mas_vnr_area [trained]
## Unknown factor level assignment for alley, bldg_type, bsmt_cond, ... [trained]
## Correlation filter removed no terms [trained]
## Sparse, unbalanced variable filter removed bsmt_fin_sf2, ... [trained]
## Zero variance filter removed no terms [trained]
## Centering and scaling for bedroom_abv_gr, bsmt_fin_sf1, ... [trained]
## Removing rows with NA values in year_built, garage_yr_blt
#'Using the recipe, prepare & juice the train ds'
juiced.train = recipe %>% prep(retain=TRUE ) %>% juice %>%
select(sort(tidyselect::peek_vars())) %>% select(where(is.factor),where(is.numeric))
#'Using the recipe, prepare & bake the test ds'
baked.test = recipe %>% prep(retain=TRUE ) %>% bake(test) %>%
select(sort(tidyselect::peek_vars())) %>% select(where(is.factor),where(is.numeric))
#preview both processed data sets
juiced.train %>% head() %>% DT::datatable()
baked.test %>% head %>% DT::datatable()
#confirm no missing values remain after preprocessing
miss_var_summary(juiced.train)
miss_var_summary(baked.test)
#post-processing EDA on the juiced training data
juiced.train %>% select(where(is.factor)) %>% DataExplorer::plot_bar()
## 3 columns ignored with more than 50 categories.
## garage_yr_blt: 97 categories
## year_built: 110 categories
## year_remod_add: 61 categories
juiced.train %>% select(where(is.numeric)) %>% DataExplorer::plot_histogram()
juiced.train %>% select(where(is.numeric)) %>% DataExplorer::plot_boxplot(by = 'sale_price')
GGally::ggcorr(juiced.train %>% select(where(is.numeric)), low = '#990000', mid = '#E0E0E0', high = '#009900', label = TRUE)
#!!<NOTE> to analyze var importance, you need an importance arg
# The 'impurity_corrected' importance measure is unbiased in terms of the number of categories and category frequencies and is almost as fast as the standard impurity importance.
#Fit a random forest on log10(sale_price). `seed = 88` makes the fit
#reproducible -- predict() below already passes seed = 88, but without seeding
#the training call the grown forest differed between runs.
model.rf = ranger::ranger(log10(sale_price) ~ . , data = juiced.train, num.trees = 300, importance = 'impurity_corrected', seed = 88)
model.rf
## Ranger result
##
## Call:
## ranger::ranger(log10(sale_price) ~ ., data = juiced.train, num.trees = 300, importance = "impurity_corrected")
##
## Type: Regression
## Number of trees: 300
## Sample size: 1097
## Number of independent variables: 71
## Mtry: 8
## Target node size: 5
## Variable importance mode: impurity_corrected
## Splitrule: variance
## OOB prediction error (MSE): 0.004251166
## R squared (OOB): 0.8592599
#viz var importance
#reshape the named importance vector into a two-column data frame (var, imp)
model.rf.var.imp = model.rf$variable.importance %>% as.matrix() %>% as.data.frame.matrix() %>% rownames_to_column() %>% rename(var = rowname, imp = V1) %>% arrange(-imp)
model.rf.var.imp %>% ggplot(aes(fct_reorder(var, imp), imp)) + geom_col() + coord_flip()
#make predictions
model.rf.preds = as.vector(predict(model.rf, baked.test, seed = 88)$predictions)
## Warning in predict.ranger(model.rf, baked.test, seed = 88): Forest was grown
## with 'impurity_corrected' variable importance. For prediction it is advised to
## grow another forest without this importance setting.
#y, pred
#baked.test$sale_price is still on the dollar scale (the recipe excludes
#outcomes from normalization via -all_outcomes()), so 10^pred matches it
model.rf.df = tibble(y = baked.test$sale_price, preds = 10 ^ model.rf.preds) #undoing log transformation
#viz
model.rf.df %>% ggplot(aes(y, preds)) + geom_point() + geom_smooth(method='lm')
## `geom_smooth()` using formula 'y ~ x'
#what is rmse of model? ~31k
model.rf.df %>% yardstick::rmse(truth = y, estimate = preds)
#compare relative performance
#what does rmse look like rel. to sd of response var?
#what was sd of sales price? ~70 - 80k
sd(test$sale_price)
## [1] 73030.31
#build matrix inputs for glmnet: predictors as numeric matrices, outcomes on
#the log10 scale (so all glmnet predictions below are log10 dollars)
x.train = juiced.train %>% select(-sale_price) %>% data.matrix()
y.train = log10(juiced.train$sale_price)
x.test = baked.test %>% select(-sale_price) %>% data.matrix()
y.test = log10(baked.test$sale_price)
#---------------------------------------------------(0)ridge
#cross-validated ridge regression (alpha = 0: coefficients shrunk, none dropped)
#NOTE(review): cv.glmnet's fold assignment is random -- set a seed beforehand
#to make the reported lambda values reproducible
model.ridge = cv.glmnet(
x = x.train, y = y.train,
type.measure = 'mse',
alpha = 0, #ridge regression
family = 'gaussian'
)
model.ridge$lambda.1se #value of lambda that resulted in simplest model
## [1] 0.1006081
model.ridge$lambda.min #value of lambda that resulted in the smallest sum of squares
## [1] 0.01426093
#Back-transform predictions from log10 to dollars: 10 ^ pred -- NOT pred ^ 10,
#which raises the prediction to the 10th power. Also use the full argument
#name `newx` (the original relied on partial matching via `new =`).
model.ridge.preds = as.vector(10 ^ predict(model.ridge, s=model.ridge$lambda.min, newx = x.test))
#y, pred -- both columns on the dollar scale
model.ridge.df = tibble(y = 10 ^ y.test, preds = model.ridge.preds) #undoing log transformation
#viz
model.ridge.df %>% ggplot(aes(y, preds)) + geom_point() + geom_smooth(method='lm')
## `geom_smooth()` using formula 'y ~ x'
#rmse with truth and predictions on the SAME (dollar) scale; the original
#subtracted dollar-scale predictions from log10-scale y.test, which is why it
#reported an absurd ~$17.9M rmse
model.ridge.rmse = sqrt(mean((10 ^ y.test - model.ridge.preds)^2))
paste0('ridge regression has a rmse of ', formattable::currency(model.ridge.rmse, digits = 0))
## [1] "ridge regression has a rmse of $17,899,151"
#---------------------------------------------------(1)lasso
#cross-validated lasso regression (alpha = 1: coefficients can be zeroed out)
model.lasso = cv.glmnet(
x = x.train, y = y.train,
type.measure = 'mse',
alpha = 1, #lasso regression
family = 'gaussian'
)
model.lasso$lambda.1se #value of lambda that resulted in simplest model
## [1] 0.005007287
model.lasso$lambda.min #value of lambda that resulted in the smallest sum of squares
## [1] 0.001639661
#Back-transform predictions from log10 to dollars: 10 ^ pred -- NOT pred ^ 10,
#which raises the prediction to the 10th power. `newx` spelled out in full.
model.lasso.preds = as.vector(10 ^ predict(model.lasso, s=model.lasso$lambda.min, newx = x.test))
#y, pred -- both columns on the dollar scale
model.lasso.df = tibble(y = 10 ^ y.test, preds = model.lasso.preds) #undoing log transformation
#viz
model.lasso.df %>% ggplot(aes(y, preds)) + geom_point() + geom_smooth(method='lm')
## `geom_smooth()` using formula 'y ~ x'
#rmse with truth and predictions on the same (dollar) scale (the original
#mixed log10-scale y.test with dollar-scale predictions)
model.lasso.rmse = sqrt(mean((10 ^ y.test - model.lasso.preds)^2))
paste0('lasso regression has a rmse of ', formattable::currency(model.lasso.rmse, digits = 0))
## [1] "lasso regression has a rmse of $17,966,379"
#---------------------------------------------------(0.5)elastic
#cross-validated elastic net (alpha = 0.5: equal ridge/lasso mix)
model.elastic = cv.glmnet(
x = x.train, y = y.train,
type.measure = 'mse',
alpha = 0.5, #elastic regression
family = 'gaussian'
)
model.elastic$lambda.1se #value of lambda that resulted in simplest model
## [1] 0.01001457
model.elastic$lambda.min #value of lambda that resulted in the smallest sum of squares
## [1] 0.002722551
#Back-transform predictions from log10 to dollars: 10 ^ pred -- NOT pred ^ 10,
#which raises the prediction to the 10th power. `newx` spelled out in full.
model.elastic.preds = as.vector(10 ^ predict(model.elastic, s=model.elastic$lambda.min, newx = x.test))
#y, pred -- both columns on the dollar scale
model.elastic.df = tibble(y = 10 ^ y.test, preds = model.elastic.preds) #undoing log transformation
#viz
model.elastic.df %>% ggplot(aes(y, preds)) + geom_point() + geom_smooth(method='lm')
## `geom_smooth()` using formula 'y ~ x'
#rmse with truth and predictions on the same (dollar) scale (the original
#mixed log10-scale y.test with dollar-scale predictions)
model.elastic.rmse = sqrt(mean((10 ^ y.test - model.elastic.preds)^2))
paste0('elastic regression has a rmse of ', formattable::currency(model.elastic.rmse, digits = 0))
## [1] "elastic regression has a rmse of $17,943,946"
#number of top-importance variables to keep for the reduced model
N = 50
(model.rf.var.imp.topN = model.rf.var.imp %>% head(N))
#filter data to top `r N ` important vars
#all_of() makes the external character vector selection explicit (bare
#external vectors in select() are ambiguous and deprecated by tidyselect)
juiced.train.filt.imp.vars = juiced.train %>% select(sale_price, all_of(model.rf.var.imp.topN$var))
#create rf model -- seed added so the refit is reproducible, consistent with
#the seed = 88 already used at predict time
model.rf2 = ranger::ranger(log10(sale_price) ~ . , data = juiced.train.filt.imp.vars, num.trees = 300, importance = 'impurity_corrected', seed = 88)
#check out model
model.rf2
## Ranger result
##
## Call:
## ranger::ranger(log10(sale_price) ~ ., data = juiced.train.filt.imp.vars, num.trees = 300, importance = "impurity_corrected")
##
## Type: Regression
## Number of trees: 300
## Sample size: 1097
## Number of independent variables: 50
## Mtry: 7
## Target node size: 5
## Variable importance mode: impurity_corrected
## Splitrule: variance
## OOB prediction error (MSE): 0.00398237
## R squared (OOB): 0.8681587
#check out factor importance
#reshape the named importance vector into a two-column data frame (var, imp)
model.rf2.var.imp = model.rf2$variable.importance %>% as.matrix() %>% as.data.frame.matrix() %>% rownames_to_column() %>% rename(var = rowname, imp = V1) %>% arrange(-imp)
model.rf2.var.imp %>% ggplot(aes(fct_reorder(var, imp), imp)) + geom_col() + coord_flip()
#create preds
model.rf2.preds = as.vector(predict(model.rf2, baked.test, seed = 88)$predictions)
## Warning in predict.ranger(model.rf2, baked.test, seed = 88): Forest was grown
## with 'impurity_corrected' variable importance. For prediction it is advised to
## grow another forest without this importance setting.
#y, pred
model.rf2.df = tibble(y = baked.test$sale_price, preds = 10 ^ model.rf2.preds) #undoing log transformation
#viz -- plot the rf2 results (the original plotted model.rf.df here by mistake,
#a copy-paste error from the first random-forest section)
model.rf2.df %>% ggplot(aes(y, preds)) + geom_point() + geom_smooth(method='lm')
## `geom_smooth()` using formula 'y ~ x'
#what is rmse of model? ~32k
model.rf2.df %>% yardstick::rmse(truth = y, estimate = preds)
Commentary
#elastic net introduction: https://sciphy-stats.com/post/2019-01-25-finalizing-glmnet-models/
#troubleshooting NA RMSE: https://stackoverflow.com/questions/51548255/caret-there-were-missing-values-in-resampled-performance-measures
#10 fold repeatedcv
trControl = caret::trainControl(method = 'cv', number = 10)
#hyperparameter grid
tuneGrid = expand.grid(
lambda = c(seq(0.01,.2,.005)),
alpha = 0 #ridge
)
#cross-validated grid search over lambda via caret (ridge: alpha fixed at 0)
model2.ridge = caret::train(
x = x.train, y = y.train,
metric = 'RMSE',
method = 'glmnet',
trControl = trControl,
tuneGrid = tuneGrid
)
#pluck best hyperparameter
bestTune = purrr::pluck(model2.ridge, 'bestTune')
# Plot hyperparameter Lambda against RMSE
plot(model2.ridge)
#finalize model using best hyperparameter
#NOTE: this intentionally overwrites the caret object with a plain glmnet fit
#of the same name; everything below uses the glmnet object
model2.ridge = glmnet::glmnet(
x = x.train, y = y.train,
family = 'gaussian',
alpha = bestTune$alpha, lambda = bestTune$lambda
)
#show the final coefficients, largest first
broom::tidy(model2.ridge) %>% select(term, estimate) %>%
mutate_at('estimate', round, 2) %>% arrange(-estimate) %>% DT::datatable()
#Back-transform predictions from log10 to dollars: 10 ^ pred -- NOT pred ^ 10,
#which raises the prediction to the 10th power. `newx` named explicitly.
model2.ridge.preds = as.vector(10 ^ predict(model2.ridge, newx = x.test))
#y, pred -- both columns on the dollar scale
model2.ridge.df = tibble(y = 10 ^ y.test, preds = model2.ridge.preds) #undoing log transformation
#viz
model2.ridge.df %>% ggplot(aes(y, preds)) + geom_point() + geom_smooth(method='lm')
## `geom_smooth()` using formula 'y ~ x'
#rmse with truth and predictions on the same (dollar) scale (the original
#mixed log10-scale y.test with dollar-scale predictions)
model2.ridge.rmse = sqrt(mean((10 ^ y.test - model2.ridge.preds)^2))
paste0('ridge regression has a rmse of ', formattable::currency(model2.ridge.rmse, digits = 0))
## [1] "ridge regression has a rmse of $17,986,152"
#coefficients of the final ridge model, sorted descending -- a rough
#importance ranking, since the recipe normalized the numeric predictors
model2.ridge.var.imp = coef(model2.ridge) %>% as.data.frame.matrix() %>% rename(coef = s0) %>% arrange(-coef)
model2.ridge.var.imp %>% DT::datatable()
[h2o AutoML documentation](https://docs.h2o.ai/h2o-tutorials/latest-stable/h2o-world-2017/automl/index.html)
Start h2o (the AutoML code below is kept for reference but is commented out and not evaluated):
## #preprocess entire dataset
## h2o.df = recipe %>% prep(retain=TRUE) %>% bake(a)
##
## #select only the vars that were considered important
## h2o.df = h2o.df %>% select(sale_price, model.rf.var.imp.topN$var)
## #check for missing vars
## miss_var_summary(h2o.df)
## # Initialise h2o cluster
## h2o.init()
## # Convert data to h2o frame
## h2o.hf <- as.h2o(h2o.df)
## # describe dataset
## h2o.describe(h2o.hf)
## # identify target and features
## y <- 'sale_price'
## x <- setdiff(colnames(h2o.df), y)
## # split data into train & validation sets
## sframe <- h2o.splitFrame(h2o.hf, seed = 88)
## train <- sframe[[1]]
## test <- sframe[[2]]
##
## aml <- h2o.automl(y = y,
## training_frame = train,
## leaderboard_frame = test,
## max_runtime_secs = 30,
## seed = 88
## #project_name = "XXX"
## )
##
## #view leaderboard
## print(aml@leaderboard)
##
## #make predictions
## aml.preds = predict(aml@leader, test)
## print(h2o.performance(aml@leader, test)) #23967.5